{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn import metrics\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
    "from keras.models import Model\n",
    "from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding\n",
    "from keras.optimizers import RMSprop\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing import sequence\n",
    "from keras.callbacks import EarlyStopping\n",
    "## 设置字体\n",
    "from matplotlib.font_manager import FontProperties\n",
    "plt.rcParams['font.sans-serif'] = ['SimHei'] \n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((7000, 5),\n",
       "         id label                                               text  \\\n",
       " 0        0    时政  韩美决定每年在黄海举行反潜联合军演(图)中新网10月19日电 据韩国《朝鲜日报》网站19日报...   \n",
       " 1        1    房产  王鹏：商业地产重要的三个环节董利：刚才蔡总介绍了一下台湾的便利店，其实便利店在改变着人们的生...   \n",
       " 2        2    房产  碧桂园股份掉期亏损净利降66%料今年销售额达190亿 杨国强对楼市表示“审慎乐观” 东方早报...   \n",
       " 3        3    房产  刘新虎：关于丽泽商务区的定位问题【刘新虎】：主要是为了学习，因为以前在丰台区开发过项目，20...   \n",
       " 4        4    财经  回顾2010年的邮市：各品种多版块都很给力老票精品很给力。2010年的春天，老票精品吹响了邮...   \n",
       " ...    ...   ...                                                ...   \n",
       " 6995  6995    科技  英媒刊登照片展示全球正在消融的冰川(图)新浪科技讯 北京时间4月30日消息，据英国《卫报》报...   \n",
       " 6996  6996    时尚  搭配QA：厚厚羽绒服怎么穿出流行感Q：冬天不得不穿上厚厚的羽绒服御寒，可以大部分羽绒服设计都...   \n",
       " 6997  6997    家居  加以零星别致的摆设(图)快乐来临时，一切都是美好的，清新的色调，加以零星别致的摆设，体现出充...   \n",
       " 6998  6998    时政  基地组织训练5岁娃娃兵练习射击充当人弹(图)中新网7月11日电 据英国媒体10日报道，英国一...   \n",
       " 6999  6999    时政  国务院修改发布外商投资电信企业管理规定新华社北京9月12日电国务院关于修改《外商投资电信企业...   \n",
       " \n",
       "                                                 cutword  cutwordnum  \n",
       " 0     韩美 黄海 反潜 联合 中新 中新网 新网 10 19 日电 韩国 朝鲜 日报 网站 19 ...         223  \n",
       " 1     商业 商业地产 地产 三个 环节 介绍 下台 台湾 便利 便利店 便利 便利店 改变 生活 ...         429  \n",
       " 2     碧桂园 桂园 股份 亏损 净利 66 年销售额 销售 销售额 190 杨国强 国强 楼市 审...         305  \n",
       " 3     商务 商务区 定位 定位问题 学习 丰台 丰台区 台区 开发 发过 项目 2006 撤出 撤...         496  \n",
       " 4     回顾 2010 邮市 品种 版块 精品 2010 春天 精品 吹响 邮市 反转 进军 军号 ...        1160  \n",
       " ...                                                 ...         ...  \n",
       " 6995  刊登 照片 展示 全球 消融 冰川 新浪 科技 北京 时间 30 消息 英国 卫报 报道 全...         439  \n",
       " 6996  搭配 QA 厚厚 羽绒 羽绒服 穿出 流行 冬天 厚厚 厚厚的 羽绒 羽绒服 御寒 大部 大...          89  \n",
       " 6997  零星 别致 摆设 快乐 来临 临时 一切都是 美好 清新 色调 零星 别致 摆设 体现 现出...          21  \n",
       " 6998  基地 组织 训练 娃娃 娃娃兵 射击 充当 中新 中新网 新网 11 日电 英国 媒体 10...         173  \n",
       " 6999  国务 国务院 修改 发布 外商 外商投资 投资 电信 企业 管理 新华 新华社 北京 12 ...         340  \n",
       " \n",
       " [7000 rows x 5 columns])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 读取测数据集\n",
    "# train_df = pd.read_csv('./cnews/cnews.train.txt', sep='\\t', names=['label','content'])\n",
    "# val_df = pd.read_csv(\"./cnews/cnews.val.txt\", sep='\\t', names=['label','content'])\n",
    "# test_df = pd.read_csv(\"./cnews/cnews.test.txt\", sep='\\t', names=['label','content'])\n",
    "train_df = pd.read_csv(\"../cnews/train_data_com3.csv\")\n",
    "train_df.columns = ['id', 'label', 'text', 'cutword', 'cutwordnum']\n",
    "val_df = pd.read_csv(\"../cnews/cnews_val.csv\")\n",
    "test_df = pd.read_csv(\"../cnews/cnews_test.csv\")\n",
    "train_df.shape, train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-2-36e029c93983>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m## 查看训练集都有哪些标签\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0msns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcountplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxlabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Label'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxticks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'train_df' is not defined"
     ],
     "ename": "NameError",
     "evalue": "name 'train_df' is not defined",
     "output_type": "error"
    },
    {
     "data": {
      "text/plain": "<Figure size 432x288 with 0 Axes>"
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "## 查看训练集都有哪些标签\n",
    "plt.figure()\n",
    "sns.countplot(train_df.label)\n",
    "plt.xlabel('Label',size = 10)\n",
    "plt.xticks(size = 10)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                id   cutwordnum\n",
      "count  7000.000000  7000.000000\n",
      "mean   3499.500000   361.875571\n",
      "std    2020.870275   414.731533\n",
      "min       0.000000     5.000000\n",
      "25%    1749.750000   107.000000\n",
      "50%    3499.500000   260.000000\n",
      "75%    5249.250000   485.250000\n",
      "max    6999.000000  7784.000000\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxQAAAIqCAYAAAC9hAz4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAABYlAAAWJQFJUiTwAABD8ElEQVR4nO3debxcRZ3//9cnCSQsguyg4ARHkUVEBRXBH6ijA4I4IiAioKCOC44bLoC4ocDgwojCl8UVRgQXRFRQEEGCw6KyDYqCAgYYlhhlEUhIIPn8/qhq0mm6k77n3uQu/Xo+Hv0499apc7r6cDv0u+tUVWQmkiRJktTEpNFugCRJkqTxy0AhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEjSBBURe0bE08dAO14QEZsP4/iNImLniNi4x/4VI2LdBuedFBHTImJK07b1+TyviYi1u5TvGxFbLcvnlqTlwUAhSRNQRGwEfBu4PCK2XkbPsWVEfDUinrGUqu8Efh8Rb+vzvBtGxOkRsVot2gn4GfBvdf+OEfHfEbFJ3b8lcGdEnD3El7A7MBc4fIjH9S0iVgVOBW6JiBe0lb8YOB34ZUQ8a1k9vyQtDwYKSZqAMvMO4E3AmsAlEfHS9v31W/3fRcQpSzpPRPw0Ih6LiOiye3/gbcCRSzh+ErAL8DBwVp/NfwmwD3B2PX5+LX+kbt8F7A3Mq79vCUwB7u7z/C1z6/bhtvZuGBE5hMfRS3mODwNrAD/OzN+2CjPzCkqQWQM4dojtlqQxZZl280qSRk9mfjci5gPfA86NiJ0y87K6b35EbMqiD+tExJ7AfsARmXltLX4YmJuZ2X7uepvQvsAcyofmVvlatWx+Zi4AtgfWB07MzPvb6k0CVgBWBh7IzIVt7f5ORGwIfJ4SWB6sux6t3+bvCXwtM2+r5a0emH4DS0srUDzWVtYKKRcD31/CsWtRgtRDvSpExBbAofXXX0XEazuq/BE4Fzizy74pwDTgpvYgIkljkYFCkiawzPxhRLwFOA3YC7isbfdcFn2oBtiQclvRyW1lC+qj0+uBpwAfr70hLX9r/dDRqXFQRBzUo5kbAzPrMQFMBb4I/B8lDO1e6y2kBKATgBMiYlpmPgL8C/AP4H96nL+X7FLWCli/y8yTu+yntnM6JVAs7LF/ZeA7wIq1aEk9Qa9ewr7PAgYKSWOagUKSJrjM/FZEzMrMn3fseoTFw8KjQzjtB4A7gC8ARMS2wLOB/0fp1Xi0nvswSsj4Ssfxkyj/D1qFEgZa1mPRrUuPUcYftG7P/Uo952TgfcBtdSzCZsBtwOEdIWYSpQfklMz8c23nmkBQeiJa/w+cFBHTgJUYuVuBT6FcD4BXADd07H8+cB7wI8oYk1569oBI0lhhoJCkAdAlTED5Nr79E3i3nogniIhXAdsAb649BAAfpNyKdFBmnlTrvQT4BHBSZn6mz6bOBb5O+cA/n9Jj0prd6efA7ymBYgXKrVWvqvv+Cfhkj3OeC/y5/nwGZZB3u2NZNI7hwLrdMiKW9EF/rbp9wv9HI+Jgyq1jSbm+ewH/X0e1Dev2GSweKILy2iZl5qFI0jhgoJCkCSYizgHWoXzD37qt5x2ZeVNH1dYH3qGceyrwX8BvgG/VsmcCrwX+F/hBvd3nEeAttQ1f6/f8mfkAZdwEdVzB+4BZlJ6LsygfxL+SmbNqnV/U1/H0zJxZy9YGZlNmUXoHi/e8fAe4khJYNqIM8D4HuJzSm/GHWu/l9bE0q7T/EhG7AJ+j9Lr8EHgzJWg91nHcCnX7z3QPFI+xaPyFJI1pBgpJmnjWATagfJDeAHgSiz7AtluJIQSKOr7hNGBTygxSm0fESsCnKLcKvZ3Si9C5tsJdbbciHZGZn+rjufan9FTcSAkwX6V8m38w8No6
a9WqwMsogeKetsM3qNt7M3NO+3kz89S253gpJVDMyMzjatm0uvtLmfn+trqbU25bWqy8i60p13RfSi8OwCsy87qO17ctcAXwo8x8wxLOJ0ljnoFCkiaYzNy+9XNEfIfFp1httxJ93uZUz5ttC+X9d8fur2TmbyLiBEqA2ZYycPs44CZgE8q4iyWOCahjHD4HvJUyC9K/ADvU3XdQQst/U2ZgmsGiMQ+bA9fUn9ev27v6fW11vYiP0XsK11a7H+6xH4DM/ExE/CAz/xARrUDxzoi4p6Nq65anZ0fEpzr2TaH0lpyamdf39QIkaRQZKCRpMHTecgNlWtJuQWNJPkoZUHw1ZRamH1HWujgMIDO/VnsyrgJuAT6UmQtqb8AHerSj3f6UW6UuBvbMzPsionVb0UqZeXJdqO86ygxIrUHaz+GJgeIvfb6mVwAfotxWdUkt27bjg/6qdbt9W3nr9qR5mXlEq2Jm/oHFDeWWp9a+lSmByUAhacwzUEjSYOhcR2Jdyrf7DwzpJJm/AH5Rz/Eu4FnA3pl5b1u1f6eEjgPqWhRDOf+XIuJ3wCWZubCOSXgbcGDrdqXMfH9EvA9YF/gy8G7Kehen1tO0vv2/udfzRMQKlCABsCvwK2BnFt069aL66LRjfbS7DTiiS91W78mQbnmqa3RMA6ZFxAqZOZTZtyRpuTNQSNJg2qJuh7q6NPD4QOzPA2dm5vfayl9M+ZC/EHhJ/R3KmhUAe9UF9SZT1mj4S/uYijqG4VpgpYiYR1ll+8WU6WjbzQDOBo6hzKDU/iF/G0rPS+dUra3z70NZpfqfa/EJwHs7bul6b2Ye33bc+pRr9fgYkLbF+RYbmN2mNR7j2u4LjQOwd0Ts3WsnZYzIJUvYL0mjzkAhSYOpFSh+P9QD63iD71HWlziorXxFyriHxyi3A72WRWM0Wgu8bUn5ID+llnUu2vYG4Jtdnvb0iDi97ffHgPUy896IuBD4SERslpl/BF4IXJOZ3W7nugjYjjIL02mUWZhuaVsJ/Kl1+9eeF6Cqq3vPo/dtY2vU7VGU28P6FZTxLdPo/7YtSRo1BgpJGhARMSUzW/fyt2ZiunqI51ibMiD6uZSeiA9FxBaUQdGrZ+b6EfF8YGZmzm877qXAL4FPtGZU6uFGykDuuZRxEbtSpnX937p/TeA9wMVtt1mdBXwE2D8ifkC55ek7Pc5/ImV9i/2AZ1ICRbtn1O2s2gszuf6+dmtbe1ha/pGZvQZ/n0cZR/KNzFwsoETEfcDUzFy5x7GSNG4YKCRpMATwo4j4EnAh5Vai2SwayNyqszSbAS+tP7+XMjbjFsrtRdcBZOafACLicODYtsXvqOVTKDMqndj5QTszr6SsE0FEXE7pBXlTZj5Yyw6pVU9rO+a3EXEtpbekNSD7hz3a/13KbVoLa2DotG3d/p4yIHqDjv3vro+Wb1IGkbde2+qUmajmUnpR7gde1+WWpwQWLGHxvCnA1Pr4emvdDUkaiwwUkjQY3k8JEXdRPsw+hfLNeftg7cldjltMZv4qIo6kLFx3BXB1XYxuMRHxbuBIyroMr+vYvR9lVevtImKnjja0jt+NMnbicuDFNVysSFmH4kZKMGh3HCVkHAhcm5mX92j/0maZ2hb4v8z8W0Q8RFmj4qVd2jeNEho6p5FdgzLtbb9O6qPOTymL+0nSmDRp6VUkSeNYa2Dweyk9CO8DWt/ydw50XpEnCjp6LjLz45l5VGZenJkPRMTqEfH6iPjviFgtIl5CWYxuNt1Xez6NsgDeKym3KnWzkBIanglcQPmm//eUmZ3O6DJ71PeB1iJ2vW53WqI6NmQL4H/a2rA0i7UjM2dmZnQ+gH+i9HjMpwS72yghYUvK+hr3Am/udmznDFGSNNYYKCRpYntW3f6NMkh6F8picTMy85qOuitSBhgv6ChbLGhExJSIeGFEHBIRv6zn/i5l9qRnU8Y8zAd2ad3+RFsoqT0Sb6VMWfuZiOhcWZvMPK9Op7oeZbzG
n1l0+9GnI+KqiNiutmdF4GuUtRsADmubXWoodqf00vyowbFdRcRTI+IIShh6GrB7Zv6stT8zf0+ZnvZ64LSIuCEi3hURnbdaSdKY5S1PkjRB1Q/Vm1JucXoT5d/8r1ACw/s762fmMZRpWNtNpS1QRMRJlMXnWlOl3g18HTiX8i37WZSVsnfJzKvazvO0un2sPtf/RcRHKb0k34yIF/RYs+KfgaPr6zgD+ATllql3An+LiE1q+daUUDODMvD65xGxR2b+vPcVenwWptbg8YMotzH9rK3OU3qMc2j9/3OxL+bqVLLPA15OCXAvpoSp84B3ZOadbcdNrtfi7oh4OSWQfbS2/8S6HselwE8y84IlvA5JGlUGCkmauH4LfJsyJuBnEbEhcDowfwi30awITIqIyfUD/+xa9m3KQnIX1+lTiYhTKbck7ZWZF9Wysym3LbUGQP+57dwnUwYwX0bpXWgNvF6F0otyALAb5VamdwJfqb0bR9Rg8wHKmIoVgP8EPl5X5V4X+BRwfkScAhyVmY9P2xoRBwE7UaaXhbJOxP6U8ROntI0JmUIJNEsa5zCt4/dnU26ZapVfRFm74lcd9aZQpoYFHu+1OSMizqTMbPVWSm/SZsB/L+H5JWnURZexcJKkCaKuCJ19DEbudfz6wLTMnFl/fwrlhE+YKjUipgIvb7+lp/ZCHAn8idKT8Jn2QdgRsUpmPtz2+9MoszxtQLkl6kTKTFF/b6uzKmWw9paUW4nemZmXdbTlQOB4SgDaqe3WKyLiVZQw8xvgu5l5Vp3x6QDgW5l5Y613N3DTUgZln56Z+3fsOwhYC/heZt7UeWyt8wCwGrBq++vvqLM6sGVm/k+3/ZI0VhgoJEnLTO1toNeH5h7HvApYB/h+Zs7tUecZlFWkv9HjViki4lnAipn5uyE3vBy/LvBY23oXkqQuDBSSJEmSGnOWJ0mSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1NiU0W6AICL+QlkxdeYoN0WSJEkT23TgH5m58Uid0EAxNqy20korrbnZZputOdoNkSRJ0sT1xz/+kblz547oOQ0UY8PMzTbbbM2rr756tNshSZKkCWzrrbfmmmuumTmS53QMhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWpsQgSKiDgoIrLPut+LiJldyjeKiDMjYnZE3BcRJ0TEtKb1JEmSpEEwZbQbMFwR8Xrg+D7rHgDsBdzWUb42cCmwAXAccD/wEWANYN+h1pMkSZIGxbgNFBExCfgMcAhwD/CUpdR/OvBloFtPxieB6cBrMvMntf51wM8i4pTMvHSI9SRJkqSBMJ5vedoSeDuwO3DhkipGxGTgdODvwI869gWl1+K6VkgAyMzzgT8Bew6lniRJkjRIxm0PBXAHsHlmzo6IPZZS92PAC4AdgHd07FsXWA/4dpfjrgK2HmK9niLi6h67Nl3ascvS9EPP61o+85hdl3NLJEmSNN6M2x6KzLw3M2cvrV5EbEsJFEdk5hVdqqxZt7d22TeLcovTUOpJkiRJA2M891AsVUSsSrnV6TLg6B7VWqHqoS775gCrD7FeT5nZtRej9lw8f2nHS5IkSWPNhA4UlNmf1gBempkLe9SZU7fRZd8kYNoQ60mSJEkDY8IGijqu4gDKmIlH6pSvAFOBSfX3R4G7KDM/bdzlNOsCD9Sf+60nSZIkDYwJGyiA3er2lProNBuYkZkvjYgbgB271NmWEiTIzHn91JMkSZIGybgdlN2HzwGv7PL4OWUQ9SuBD9a6PwB2iIhtWgdHxEuALYAL2s7Zbz1JkiRpIEzYHorM/APwh87yiNgPeFZm/qKt+ATKmhY/iogPU4LWF4CHgZMb1JMk
SZIGwkTuoehbZv4N2IlyG9S3gW/VXXtn5s1DrSdJkiQNignRQ5GZB1AGYPdbt1v57yLi+cCLgCcBl2fmE6aI7beeJEmSNAgmRKAYKXVq2W6L3zWqJ0mSJE103vIkSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJamxCBIqIOCgisse+10TEVRExNyLmR8Q1EfGyLvW2iIjzIuL+iJgVEUdExBOuT7/1JEmSpEEwZbQbMFwR8Xrg+B77XgecBfwvcDgwDfgAcEFEbJOZ19d6mwCXUgLWF4CpwCGU63N42/n6qidJkiQNinEbKGqvwGcoH+jvAZ7SsX9FStA4H3hNZj5Wyy8Hfgm8EzioVj8WWA14YWZeW+vdCZwQEV/LzL8MsZ4kSZI0EMbzrTpbAm8Hdgcu7LL/ycCXgXe3wkR1Vd2uBxARqwE7A+e2QkL1DWBuPX/f9SRJkqRBMm57KIA7gM0zc3ZE7NG5MzP/Cny2y3Hb1u11dbsZ5Tpc3HH8/Ii4Hth6iPV6ioire+zadGnHSpIkSWPRuA0UmXlvw0MPAR4BTqu/r1m3t3apOwuYPsR6kiRJ0sAYt4GiiYh4G/AK4IjMvL0Wt277eqjLIXOA1YdYr6fM7NqLUXsunr+04yVJkqSxZjyPoRiSiNiKMqbiMuDItl1zWlW6HDaJMjPUUOpJkiRJA2MgAkVErAf8GLgX2LNjkPaddbtxl0PXBR4YYj1JkiRpYEz4QBERqwLnAmsBu2XmPR1VbqGEgR07jpsMbAPcNcR6kiRJ0sCY0IGihomfAc8FXt8x3SsAmbkAOAfYIyKe1rZrH8q4iAuGUk+SJEkaJBN9UPYpwEsooWLNiNivbd9DmXlO/floYE/g/Ig4HFgf+BxwN3BG2zH91pMkSZIGwkQPFK31KV5VH+1uo/Q4kJl/iohdKFPJnl333wLs2z49bb/1JEmSpEExIQJFZh4AHNClvO+ZlzLz0ojYBNiOcivY5Zk5r2k9SZIkaRBMiEAxUjLzUWDGSNWTJEmSJroJPShbkiRJ0rJloJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjU0a7ARq7ph96XtfymcfsupxbIkmSpLHKHgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYxMiUETEQRGRPfZtFBFnRsTsiLgvIk6IiGnLup4kSZI0CMb9StkR8Xrg+B771gYuBTYAjgPuBz4CrAHsu6zqSZIkSYNi3AaKiJgEfAY4BLgHeEqXap8EpgOvycyf1OOuA34WEadk5qXL
qJ4kSZI0EMbzLU9bAm8Hdgcu7NwZEQHsBVzX+vAPkJnnA38C9lwW9SRJkqRBMm57KIA7gM0zc3ZE7NFl/7rAesC3u+y7Cth6GdXrKSKu7rFr06UdK0mSJI1F47aHIjPvzczZS6iyZt3e2mXfLMqtS8uiniRJkjQwxnMPxdK0wtJDXfbNAVZfRvV6ysyuvRi15+L5SztekiRJGmvGbQ9FH+bUbXTZNwmYtozqSZIkSQNjIgeKu4AENu6yb13ggWVUT5IkSRoYEzZQZOY84AZgxy67t6UEhBGvJ0mSJA2SCRsoqh8AO0TENq2CiHgJsAVwwTKsJ0mSJA2EiR4oTqAsevejiHhjROwHnAU8DJy8DOtJkiRJA2FCB4rM/BuwEzCbsn7Et+quvTPz5mVVT5IkSRoUE2La2Mw8ADigx77fRcTzgRcBTwIuz8wnTP060vUkSZKkQTAhAsXSZOZC4IrlXU+SJEma6Cb0LU+SJEmSli0DhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIamzLaDdD4M/3Q87qWzzxm1+XcEkmSJI02eygkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNTfhAERGTI+JjEXFjRMyNiLsj4pyIeE5HvS0i4ryIuD8iZkXEERHxhOvTbz1JkiRpEEwZ7QYsB8cBbwOOB24CNgL+HbgsIp6XmTdHxCbApZSA9QVgKnAI5foc3jpRv/UkSZKkQTGhA0VErAK8E/hcZrYHg3OB3wJvBD4NHAusBrwwM6+tde4EToiIr2XmX+qh/daTJEmSBsKI3qoTEf8cETtExOQGxz4lIp4xku0BVqWEpgc7yh+p2/kRsRqwM3BuKyRU3wDmArvX9vVVT5IkSRokI91D8U7gYGAVYMEQj/0YsFNEvCYzbxiJxmTmrIi4FvhARPwGuAL4J+BESqj4IbAZ5Tpc3HHs/Ii4Hti6FvVbr6eIuLrHrk37flGSJEnSGDLSgWIOEMDJEXEfcBfwF+Bm4IbMfLTbQRHxSuAdwMPA/BFu067A+cBFbWV3A6/IzJsi4lW17NYux84Cptef1+yzniRJkjQwRjpQLKzbN7WVZd3Oi4hrgP8BzsnMKwEiYlvgB7XOezLzzyPcprcBzwH+AFwJPBV4JXB0ROzOotu+Hupy7Bxg9fpzv/V6ysyuvRi15+L5SztekiRJGmuWxaDsBNYFNgDWr48NgW2AFwIfAT5cw8V3gcMoYx0+mZmnjWRDImJz4Ajgm8DbMnNhLd8RuJAyU9O3WtW7nGISMK3+PKfPepIkSdLAWCazPGXm34G/A7/v3BcRzwfeDRzIom/l35WZpyyDpvwrJQB8qRUmavtmRMQM4OXAMbV4Y+CSjuPXBR6oP9/ZZz1JkiRpYAxrlqeIOCsivtzvzE4R8SzgdSyaEWku5QP/DhHR7Zv/4Wqdc8Uu+1amBKpbKGFgx462Tqb0qtxVi/qtJ0mSJA2MxoEiItYBXgP8B/BL4G/Am+u+jSJixYh4bkS8KSJOiYgbKeMYPkoZqP1mYG3gTOANlPUgRlprAPVbO9q+K7At8NvMXACcA+wREU9rq7YPZVzEBQD91pMkSZIGyXBuebqf8qF8S8qUqf9CmVoVSmBIFgWWhcD1wOeBszLzqtZJIuIA4FnAYRFxXmuw9gg5H/gj8I46
k9SfKWM6tgLmUaaqBTga2BM4PyIOr3U+R5kN6oy28/VbT5IkSRoIjXsoMvPRzLwmM0/LzPdm5hbAUyi9AZdQbjcKynoUJwC7ZOah7WGidR7KitWPUaabHbFbnzJzHuUWpVOAFSih5+m1fa9qrXeRmX8CdgFWAs6mrFMxC9g9M+9tO19f9SRJkqRBMaxB2XVl63mZeUdETAUuB87MzFdExPrAAcD7gfcCG0bE2zLzgY5zbEu5XeqLlBmg3gJ8fTjtapeZsykL7i2t3qURsQmwHSVoXV4DSaN6kiRJ0iAY7ixPHwdeHxHfAP6LsrjbynXfQ8AOlJDwZGAG8NeIuAg4DfheZiZwPKVn47mUb/9/Ocw2NVZ7S2aMVD1JkiRpohtuoPg6MBV4O2WF68eAR+uA7Z9RpoW9MTMPBoiIFYCdgZ2Aj0fEmZTxF5/MzNkRsV1mPjzMNkmSJElaToYVKDLzUqB1C9B8yq1FC4B3AJsD+2fmtzsO+zJlitWDKTM7JXB6PZ9hQpIkSRpHhjNt7PSIeG9EvB3YnjL4eTLwbOAO4FhglYh4b8eh92bm5yiB455adkTTdkiSJEkaPcPpoXgWcBylh6F9ZqZd6qNVlhFxH+V2qHbzKCtM3wTsGxGXZubXhtEeSZIkScvZcALFDcC/U251erQ+zqBMz3oPZY2Hh4BVKLdCbUsJH1tFxCosmiXpDcB/Ap+PiB9k5n3DaJMkSZKk5Wg461D8X2Z+nTL+YVXKugwLKYvarQd8AJiRmd8EjgROovRavJaygvUngNmZeT3wPkrwOLjxK5EkSZK03DUOFAARsSpwAWXhuL0oYyjOAV4HbAL8IiJWz8yfUUIDwE+BmynjLi4GyMybgXOBd9f1LCRJkiSNA8MKFJReh5cDHwbOotxCtWJm/gh4G7AF8P1atxUU/jcztwdeQ53dqTqVsl7FXsNskyRJkqTlZLjrUBwKnJuZ323rWZgKkJnfiogdgQMi4qXAdXX/SnX/uR3n+gVwNzB7mG2SJEmStJwMdx2KO4Hv1p/nRcQGwNy2Kh8Grs/MS2rgOBC4sce55kTEtpl5x3DaJEmSJGn5GW4PxWIyc1bH7/dRFrIjM+cBp/U6NiJWpix4J0mSJGmcGO6g7OdGxNERsWFb2YoRMbmPYzsHX58BfKcO9JYkSZI0Dgx3UPYmwCHAWm1lPwLmR8T8iJjT5fFIRDwGPBwRzwGIiMMog7Q3oUwfK0mSJGkcGO4tT4/U7byO8tvpMVaiwz8iYnvg08BMYKfO26YkSZIkjV3DDRStILGgo/yHmbnUReoi4knAL+t5XpuZ9wyzPZIkSZKWo5EalL2wW2FEbATsQlk9+3bg75nZPi3sQ8DXgb/WFbM1jk0/9Lyu5TOP2XU5t0SSJEnLy0gEigA+HhGXA78DprXtewVl8bt8vHLEo5Rw8VvgW5l55Ai0QZIkSdIoGKkeijfVRys4XFu3cyhTxc6nBI8VgfWBzYF9gDdExC+BfR07IUmSJI0/IxEoEngJJTBsSpn1qezI/C514btOEbEl8AlgD+CKiHhpZt4+Au2RJEmStJwMN1C0pp39e2b+Cbg8IvYCdoyI/1rKsZmZe0XEQZTF734WES/MzIeH2SZJkiRJy8lwA0VrvETneZ5XH0uSwAcz88SIWB04CjgBOHCYbZIkSZK0nIxEoAhgpbay99TzzueJ08kCTAZWYPEF7D4HHAC8KSJOycwrh9kuSZIkScvBcFfKvgh4GXB7RLwpIrbMzJuBO4FzgVdm5m3AbOBUYEfgjsy8KTOvaZ0kMxcAxwMXA1cNs02SJEmSlpNhBYrM/GtmzgDmAt8Edqq75lEGaK9cf9+OEia+CfwpIt7U5XTfAd6YmY8Np02SJEmSlp/h9lC0PEK59WkuQGbOr+WP1u2vgBcDxwJrAt+MiB9HxJNbJ8jMv3UseidJkiRpjGscKCJiSkSsGhGTWbRSdmfvwsJWaMjMX2fmR4CNKb0RrwZ+FRFPadoG
SZIkSaNrOD0U/wI8QBl83eqJODEiFkTEAsosTlOAvwNz2spnA2+o9begTBf7pGG0Q5IkSdIoGc4sT/cBlwIPU3omsm1fALtRei6uoKyYPZdFPRktawA7UAZs7zGMtkiSJEkaBY0DRWb+hjLDU1cRsRBYmJkviYiVM3NOj3rnA6+NiAMy89Sm7ZEkSZK0/A13HQoi4rXAZixad6K9FyLq9sKImAecBJxdp4ltOQj4A/DZiDg7M/8x3DZJkiRJWj6GHSiAfwPe3KU8gckRMZWykN2LKVPH3hMRJwJHZ3FrRJwF7AP8B3D0CLRJkiRJ0nIwEtPGnkAZL7ET8ErKYO2dKL0TK2bmvMx8ITAd+CSlB2PXzGwfc/H1Wn+jEWiPJEmSpOVk2D0UmXl1Z1lErAx8F7i1rd7twJER8VnK7E7tZgBnZea7htseSZIkScvPSNzy9AR1APY+PfY9ClzXUbYwIt64LNoiSZIkadkZqZWyhy0zOxfFkyRJkjTGjZlAIUmSJGn8MVBIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqbJmsQyG1m37oeV3LZx6z63JuiSRJkkaaPRSSJEmSGjNQSJIkSWpsoAJFRKwWETMj4g8RMbVj3/YRMSMiHoyI2yLi3T3O0Vc9SZIkaRAM2hiK44ANge0zc16rMCK2By4C/gp8utY5PiLmZ+ZXh1pPkiRJGhQDEygiYjfgQOCzmfnrjt0nAXMpQeOOWn8ucGxEnJGZDw+xniRJkjQQBuKWp4hYG/gq8Efgkx37Nge2BE5thYTqS8CTgJ2GUk+SJEkaJIPSQ3EisB5wFLBLRNwJ/DYzkxISAC5uPyAz746Iu4CtgbOHUK+niLi6x65Nh/BaJEmSpDFjwgeKiNgF2Kv+eiTwGLAmcEVEvLH+DHBrl8NnAdPrz/3WkyRJkgbGINzy9Nm6fQ+wVmauBbyU0itwLouuwUNdjp0DrF5/7rdeT5m5dbcHcGNfr0SSJEkaYyZ0oIiITYBnA+dn5gmZ+RhAZs6g9FZswaKeh+hyiknAtPrznD7rSZIkSQNjQgcKYK26/UWXfa1egdvrduMuddYFHqg/39lnPUmSJGlgTPRA0QoBC7vse0rdPgIksGP7zohYB3g6cFcturbPepIkSdLAmNCBIjNvp0wV+8aImNwqj4gpwDsovQo/AS4F3hIR7eMg3kG5vemCeq7Z/dSTJEmSBsmEn+UJOBj4MXBlRHybEqL2A54H/EdmzomIT1Cmg/1pRBwNbAV8Avgd8PO2c/VbT5IkSRoIEz5QZOb5EbEDcAhwGLAacAvw1sz8Rq1zaUS8HjiZMvMTlFuc9s7M+W3n6queJEmSNCgmfKAAyMwrgd2XUufsiDgf2A6YC1yZmQua1pMkSZIGwUAEin5l5hy6zwjVqJ4kSZI00U3oQdmSJEmSli0DhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMamjHYDNLimH3pe1/KZx+y6nFsiSZKkpuyhkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktTYwAWKiJgcEZdHxCUd5VtExHkRcX9EzIqIIyLiCden33qSJEnSIJgy2g0YBR8HXgzMaBVExCbApZSA9QVgKnAI5focPtR6kiRJ0qAYqEARES8GPgZkx65jgdWA
F2bmtbXuncAJEfG1zPzLEOtJkiRJA2FgbtWJiCcBpwNXA1e0la8G7Ayc2woJ1TeAucDuQ6knSZIkDZJB6qH4MrAO8K/A19vKN6Nch4vbK2fm/Ii4Hth6iPV6ioire+zatJ8XIEmSJI01AxEoImJP4ABg/8y8JSLad69Zt7d2OXQWMH2I9SRJkqSBMeEDRUQ8FTgFOD0zT+9SpXXb10Nd9s0BVh9ivZ4ys2svRu25eP7SjpckSZLGmgk9hiJKV8RpwP3AQT2qzWlV77JvEjBtiPUkSZKkgTHReygOBl4G7AZMjYiptXwFgIhYG/h7LdsYuKTj+HWBB+rPd/ZZT8M0/dDzeu6becyuy7ElkiRJWpoJ3UNBCRKTgPOA2W2P7epjNvBhShjYsf3AiJgMbAPcVYtu6bOeJEmSNDAmeg/FB4E1upQf27b/LmABsEdEfCIzb6/79qGMi7gAIDMXRMQ5S6snSZIkDZIJHSgys+s0rRFxX93/i/r70cCewPkRcTiwPvA54G7gjLZD+60nSZIkDYSJfstTXzLzT8AuwErA2cCJlKlgd8/Me4daT5IkSRoUE7qHopfMfGmXsksjYhPK2IpJwOWZOa9pPUmSJGkQDGSg6CUzHwVmjFQ9SZIkaaLzlidJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNTRntBkhDMf3Q87qWzzxm1+XcEkmSJIE9FJIkSZKGwUAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGjNQSJIkSWrMQCFJkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKmxKaPdAGkkTD/0vK7lM4/ZdTm3RJIkabDYQyFJkiSpMQOFJEmSpMYMFJIkSZIaG4hAERFviYgbImJ+RMyLiBkRsVVHne1r+YMRcVtEvLvHufqqJ0mSJA2CCT8oOyLeD3wRmAGcDKwHvB+4JCI2z8y7I2J74CLgr8CngQ2B4yNifmZ+te1cfdWTJEmSBsWEDhQRsQ5wNPDVzHx7W/nNwDeB/YHPAScBc4HtM/OOWmcucGxEnJGZD9dD+60nSZIkDYSJfsvTk4CjgI90lF9Vt+tFxObAlsCprZBQfakevxNAv/UkSZKkQTKheygy81ZKoOi0bd1eRwkJABd3HHt3RNwFbA2cPYR6PUXE1T12bbqk4yRJkqSxakIHim4iYgXgYGAWcA6wX911a5fqs4Dp9ec1+6ynMcQF7yRJkpatgQsUwCeBzYADM/PBiGjd9vVQl7pzgNXrz/3W6ykzt+5WXnsunr+04yVJkqSxZqKPoVhMROwMHAZ8LzNPrcVzWru7HDIJmDbEepIkSdLAGJhAERGbAmcCvwfe0rbrzrrduMth6wIPDLGeJEmSNDAGIlBExAbAT4FHgFd3TO96LZDAjh3HrAM8HbhriPUkSZKkgTHhA0UNExcD61DCRPuUr2TmbOBS4C0R0T4O4h2U25suGEo9SZIkaZAMwqDssyjTsv43sFlEbNa2b1ZmXgh8ghI6fhoRRwNb1bLfAT9vq99vPUmSJGkgTOhAERHrA9vVX99UH+1mABdm5qUR8XrgZODcuu9aYO/MnN+q3G89SZIkaVBM6ECRmffQfVambnXPjojzKQFkLnBlZi5oWk+SJEkaBBM6UAxVZs4BfjFS9TR2ueCdJEnSyJjwg7IlSZIkLTsGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJ
kiRJjRkoJEmSJDVmoJAkSZLUmIFCkiRJUmNTRrsB0lgy/dDzupbPPGbX5dwSSZKk8cEeCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjblSttQHV9CWJEnqzh4KSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjTnLkzQMzv4kSZIGnT0UkiRJkhozUEiSJElqzEAhSZIkqTEDhSRJkqTGHJQtLQMO1pYkSYPCHgpJkiRJjRkoJEmSJDVmoJAkSZLUmGMopOXIsRWSJGmisYdCkiRJUmMGCkmSJEmNGSgkSZIkNeYYCmkMcGyFJEkarwwU0jjUK4CAIUSSJC1f3vLUQERsERHnRcT9ETErIo6ICK+lJEmSBo49FEMUEZsAl1LC2BeAqcAhlGt5+Cg2TRPQknoiRvKYbuzpkCRJ/TBQDN2xwGrACzPzWoCIuBM4ISK+lpl/GdXWSZIkScuRgWIIImI1YGfg3FaYqL4BfB7YHfiv0WibNNKGOlDcgeWSJA2myMzRbsO4EREvAq4E3puZx3fsuwyYmZn7LuH4q3vs2mqllVaavNlmm41cY4fg93c+MCrPK3Xz7Keu3rW8yd9pr3MN1VCfe6SeV5KkkfbHP/6RuXPn3puZa43UOe2hGJo16/bWLvtmAdMbnnfB3LlzH7jmmmtmNjx+ODat2xtH4bnHM69bM0u9btfMGrknG8lzjfLz+vfWnNeuGa9bM163ZrxuzTW5dtOBf4xkIwwUQ9OayemhLvvmAEv8WjIztx7xFg1Tq9dkLLZtLPO6NeN1a8br1pzXrhmvWzNet2a8bs2NlWvnVKdDM6duo8u+ScC05dgWSZIkadQZKIbmzrrduMu+dQEHI0iSJGmgGCiG5hZKaNixvTAiJgPbAHeNRqMkSZKk0WKgGILMXACcA+wREU9r27UPZfzEBaPRLkmSJGm0GCiG7mjKGIrzI2L3iHgXcBJwN3DGqLZMkiRJWs5ch6KBiNgBOI1F08TeAuybmb8etUZJkiRJo8BA0VBErABsR+nluTwz541ykyRJkqTlzkAhSZIkqTHHUEiSJElqzEAhSZIkqTEDhSRJkqTGDBSSJEmSGpsy2g2QpEEREdMp/+7+pS6UKUnSuGcPxYCKiC0i4ryIuD8iZkXEERExcH8PEXFQRHSd6iwiNoqIMyNidkTcFxEnRMS0ZV1vLIuIt0TEDRExPyLmRcSMiNiqo872tfzBiLgtIt7d41wjWm8si4jdIuIO4C/An4G/RsRBHXX6ek+OdL3xIiImR8TlEXFJR7nXrU1ErBARcyMiuzx2bqvn+3QJImK1iJgZEX+IiKkd+7x2VURM7/G31v54aa3re7VN/TftYxFxY33P3h0R50TEczrqjYvrZg/FAIqITYBLKYHyC8BU4BDK38Pho9i05SoiXg8c32Pf2pRrtAFwHHA/8BFgDWDfZVVvLIuI9wNfBGYAJwPrAe8HLomIzTPz7ojYHrgI+CvwaWBD4PiImJ+ZX20714jWG8si4kXA2cAPKNdvEvAp4P9FxM2Z+fN+35MjXW+c+TjwYsrfHzDy12OCXLfnAdOAI4CbO/b9L4z8+28ivE+7OI7yOrZvX2fKa/cEs4H9e+zbH3gZMNP3alfHAW+jfA65CdgI+Hfgsoh4XmbePK6uW2b6GLAH8BPgUeB5bWXvBB4DNh7t9i2H1z8JOKq+3jvL2+AJdY4HEtitrWznWrbDsqo3Vh/AOsAc4Csd5QfU1/CR+vv1wH3ARm11jgH+AazSVjai9cbyg/KP92+BSW1lTwYWAKfU3/t6T450vfHyoASJx4CFwCXL6npMhOtGCfkLgCctoY7v0yVfw93qv2vHeO0aX8N162v4Qv3d9+ri12eV2v6jOsq3qX97nxhv123UL6qP5fsAVqt/TD/sKF8ReBA4eLTbuByuwVaUb1V2A06lI1AAAdwD
XNvl2JuALy+LemP5ATyd8u3FkzvKn13/8TsW2Lz+/MWOOhvU8tfV30e03lh/ADsCW3SUrUL50HdSv+/Jka43Xh7Ak4BbgF8Dl1EDhdet5/X6PvDbJez3fbrk67d2/ff6D8BUr13j63gKpddldd+rXa/PevW/8aEd5a3/px463q7buLnXTCNmM0rX1sXthZk5n/JNydaj0ajl7A5g88z8SY/961Le7Bd32XcVi67RSNcbszLz1sw8KjPv79i1bd1eB2xZf+7827obuItFr3Ok641pmTkjM2/oKD6c0lP2I/p/T450vfHiy5QesjdS/mfY4nXrbjtg1Yj4Xb0v+86IODEi1qn7fZ8u2YmUf69PAnaJiBdGRNR9Xrs+RMQzgbcCR2bmA/hefYLMnAVcC3wgIl4eEStFxKaUv79HgB8yzq6bgWLwrFm3t3bZNwuYvvyaMjoy897MnL2EKv1eo5GuN65ExArAwZTXcA5et6WKiKMi4jeU8TMfyczz8br1FBF7Um6rOygzb+nY7XXrEBFPB55C+RLjQuCDwLmU+7QvjogpeN16iohdgL3qr0cCX6P2jEWZoc1r159DKOMEv1Z/97p1tyulN+wiyi3FfwSeAbwiM29inF03B2UPnlaIfKjLvjmU7slB1+81Gul6480nKd94HJiZD7bNEuF1621TYGPK7U6r1jKvWxcR8VTKbROnZ+bpXap43Z5oIfAJ4MzMfHxAdkT8Gvg68Fq8bkvy2bp9D3ByZj4WETtSvi0+l9JrAV67nur7dn/g6MycU4v9m+vubcBzKLfXXQk8FXglcHRE7M44u272UAye1hs8uuybRJkdZND1e41Gut64EWX6ycOA72XmqbXY67YUmbkHZSaPk4BPRMR/4HV7gnqLyWmUbzkP6lHN69YhM2dm5mfaw0T1TeBhYCe8bl3V2W+eDZyfmSdk5mNQblmk9FZswaJveL12vb2L0vaT2sr8m+sQEZtTZmL7JrBlZr41M3cGXk6ZhOILjLPrZqAYPHfW7cZd9q0LPLAc2zJW3UUZFLW0azTS9caFep/nmcDvgbe07er3b2uk640rmfkI8AHgb5Qpg71uT3QwZbrJ9wBTI2LtOvXyCsAK9ee/17pet6XIMvJyDiXM+vfW3Vp1+4su+26s29vr1mvXRe2lfjNwQWb+tW2Xf3NP9K+UD/ZfysyFrcIaYGdQgsW4um4GisFzC+WPZsf2woiYTJmu7K7RaNRYkmXO8RvouEbVttRrNNL1xoOI2AD4KWXQ2Ksz8+G23ddSglPn39Y6lFmi7lpG9casKItjHRUR27aX1w9491Jm1+j3PTnS9cay3Sj/fzqPMiNb67FdfcwGPozXbTER8faI+EaX8g0pA9v/iu/TXlofthZ22feUun0Er92SvIKyjsa3Osr9N+6JWr0EK3bZtzJlSMK4um4GigGTmQsoA2j3iIinte3ah3L/3AWj0a4x6AfADhGxTasgIl5C6fa+YBnWG7NqmLiY8sHk1Zl5R/v+OtD9UuAtEdF+L+Y7KP94XrAs6o1xD1LuJ/5ytK22GxHPA54J/Krf9+RI1xvjPki5l7jzcX19vBL4T7xunZ4MHBgRL2sV1A8LX6i//tj3aXeZeTtlUOwb6zUDoA5kfwflg9hP8Notyd6UmdjOay/037iuWgOj39peGBG7Ur5o/O24u27DmXPWx/h8AJtQBuX8Adidcs/jg5R0uuZot285X4tT6b6w3dr1etxJma5yP8psDA8Bz1hW9cbyg7IGQFLubd+v4/HKWmcHygI5l1FmsPgo0JqSbsW2c41ovbH8APagfOt5HWXRscMo3xTfDjy11unrPTnS9cbbA7iExRe287otfn2eTJkW+yHgG5SV2a+v79sfUhdX9H3a8/rtXNv92/pePRi4pl6/d3vtlnr97gQu67HP9+ri12NqbXtSeg7Op/w/Iik9YVuMt+s26hfVx+g86j9if6l/vAncDLxotNs1CtfhVLoEirpvy7Y3
eFICwK7Lut5YfADrt7W72+OStrqvo3xgbu27Bnhml3OOaL2x/KB8UPmf+g/3/1GmU9ywo05f78mRrjeeHnQECq9b12v0NOAMyhiTh4HfUAfKdtTzfdr9+m1LCV+zgLnUsWJeu6Vet61q249cQh3fq4u3fx3gZMqXS49SVhb/JfCy8Xjdop5cA6iuI7Ad5da3y7Pc6682dZDZiyir9V6emd2mWxvxeuNdRKxM+duaC1yZpat1mdcb7/p9T450vfHO69aM79PmvHbN+F5tZjxcNwOFJEmSpMYclC1JkiSpMQOFJEmSpMYMFJIkSZIaM1BIkiRJasxAIUmSJKkxA4UkSZKkxgwUkiRJkhozUEiSFhMRz16G515lWZ2743m2i4hDI+JlS6n3whF8zudHxBciYsOROqckjQcGCknS4yLig8D1EfHqjvInRcTUiIhhPsW1EXF2RGw6zPMszYuB/wRe3qtCRLwEuCwivh8RTxqB5zwY+CDwvhE4lySNG1NGuwGSpDFlBpDA/4uISzLzoVp+IfAigCFkig9k5nGtX2rPxzOB1YG/tVeMiKnAYX2c85bM/FYf9eZ2bBcTEdOB71P+P7gRMA14sI/zLskpwL7A/hFxWGY+1vZ8k4AV6/PMzcx5w3wuSRozDBSSpMdl5lUR8Q3gbcAngI/UXd8HLgXmAwsooaOXbYBdgTkd5XvW7Uczc7FAkZnzIuIwyofuJfkR0E+gWNixfVxErAP8HFgfuAj4t8x8eGknjIiLWEKPR5v1gEeXELz2Ab7Tx3kkaVwwUEiSOn2K8k374zLz2H4Pjoh3UgLFo21lk4G3ADcCp/Y49CHgwcyc3uWc04G/AMP6Zj8i1gIuoPSUXAC8NjMf6fPw1nMfBzwwxKeezKIeij8P8VhJGtMMFJKkxWTmnRGxWWbe1iqLiNUpvRPzMrPbt/5TgJUz8x/tp2r7eVfKrUX7ZeaCeovTmcCHMvPWWucxlm5JPSNLFBHrAr8AtgTOA/YY4q1HrfZ9MTNvbzvvT4GnAy/IzMVum4qIe4BHM3Ojpu2WpLHOQdmSpCdoDxPVBZRbmBZERHY+KL0R1y/hlB+lfDPfutVnR2B34MqIWLX1tCP3ChYXEVsBv6aEiXOA1zUYx5Ad25Z1gWfRfbzGFJZ+G5ckjWv2UEiSiIhdKIOlH20rvigz76s/t8ZQzKOMoVjscGAFetyOFBF7UQZ0H5iZrWPfUrcfbBv4vSx9BphOeR1vbB8w3cDCiDieErDmUcZiAHysy7iJaZQQdgTl/7nTgHMz85fDeH5JGlMMFJIkKIOvd+wo2xq4D4Y2hqJdvbXpBOB+4C8R8XJgDeB1wPldZmx6ckR8qsupntzk+dvsQ3mNn24LNUPV6tVfALyTEijmU4IYtazTSnX7Lso4ipWA2wEDhaQJw0AhSQL4N2B+Zs6NiO8Ae1M+LD+urtuwdo/jf5uZd3YW1tmbbgNeAFzStuthun8AXx345FAaHhFHAW8CHmFR78mT6/bgiDigrfreS5n2dgVgZeCIzDy586nq9tHMXKHt+X9DGT+xPp0HlDEUj3QbaC5JE4WBQpJEZnabtWh+x+9H0Hva1N2AJwSK6kjKB/xfU76hvxL4WJdxGgC3LWWWp25WovR6PMKigdMrtW2fXH+eTAlE86k9L51PQ/n/4krA1C77W2MhOq/L5B7tanG8oqQJzUAhSeql84PzHGA28Jy2sn8DTmYJ07lm5o/h8aljLwKuBb48Uo3MzIMpq1Q/rk5dexJwVGYeU8tWoISeFYBnNBi70QoZndPMTqvn7zWo/K9DfB5JGlcMFJKkfs0DFmbmPa2CiGj1bPQzQ9PngW2B53WbenZZy8xHI+IM4H2UhfuOG+IpVgMezsxHO8pPAdZcwnGuii1pQjNQSJKWqYjYEDgWeD1wIfCqiHg7ZV2Kp1LWg7irVl8rIjrHLgA8aYSaczxwEPDJiPhuZt49hGPXoN4qFRErUW6BmgecMBoBSZLGCgOF
JGkkLGmk8/qUMAHwyvoAuBu4g8WnoV0VeMeIt67KzFsi4kRKL8XXImK3IYSB9Vi01sa7KT0uACxloDeUsR3Pz8zfDbHJkjTmGSgkSUsUEWtl5t8pg49X7ZjW9dl123PgcWZeFRGXAbdRBmb/GrihxxiGJoOy+xIRkyjjID5FGfuxC2VK24P6OHY9yliJ1qrelwGHUsaVfJLSW3E4i6/jAeXWqq2BzxomJE1UBgpJUk8R8S7ggxHxXEpoWIXu07qu0KXscZn5kpFvXf8iYl3gVOCvmXlAROwN/Ap4V0RMAd6zlJWzt6jbmwEy8wrginruf9Rz/3Nmvr/tOf8VeB4lfHxqBF+OJI0pTmUnSerlM8CJlAHXa1OmU52VmdF6UBaMg0XTtA5JRDxtRFq65OfYG/g98Cpgv4jYKjN/A+xL6VH4d+CKiHjeEk7z/9XtZZ07MvM04HvA+yLiQ/U5t65lt1PGiAxnZW5JGtPsoZAkdVqnbvcDLgd2y8x7I+J8ypSv7S4EXgz8ua3sCesyRBlksBHwXMpMTy+g3Ao0ieGvgt3NGnX7IWCt+vPPgA9l5h8AMvOsiJgPfJvSk3BVRJwO/Fdm/m/H+V5LGQfxPz2e70Dgn4DP12DyauAhYOfMnDUyL0mSxiYDhSTpcRGxBYu+jb8IeE1mzgHIzP/qrF/HVvy9o/i5dbuwnnMGJUC092IsBG4EZoxU21tqeNmt/roWcC/wvsw8vbNuZv44Irah9CY8h7Li9qYRsXNmtmZ0ekF9TRcvYe2K+ZTxGNsAb6xlx1PW7ZCkCc1AIUlq9xgwl3Jrz+NhYmki4lXAm4GnUXosAG6q2xuBHSi9G+cBvwCubn04j4g7KNOvrg1kRNzY5SlaYzR2iYjbgVUzs9faDyuwqJfkSmDvzLy9V9sz86YaKj4AvA74l8x8uK3KoXX71bbXOxXYjBI0/gX4V2Bd4H7g58DLKIO0D4uIG4DrKAO6Z1ICzl2ZeVWvNknSeBKZ/axFJEkaFBHxMuA3HR+ql3bMkylTwK5Kmc3pi5n5pbpvU2CFXrMc1UHNCyhhZkG3Om0mU74MWyUzV1xCezYEPgq8PzM7V/zuW50Z6hhgL+BZlKljL6UEp9Y4xH8AFwA/BM7JzLkRsSJletxdKeHiWSw+te4bMvO7TdslSWOJgUKSNCIiYlvgb5l582i3ZaRFxIqtYBIR+wAvBK4GrgL+tLS1LCJidco4jc0pYeSjLoYnaaIwUEiSJElqzGljJUmSJDVmoJAkSZLUmIFCkiRJUmMGCkmSJEmNGSgkSZIkNWagkCRJktSYgUKSJElSYwYKSZIkSY0ZKCRJkiQ1ZqCQJEmS1JiBQpIkSVJjBgpJkiRJjRkoJEmSJDX2/wNXfNVo+A0mLAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "image/png": {
       "height": 277,
       "width": 394
      },
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "## 分析训练集中词组数量的分布\n",
    "print(train_df.describe())\n",
    "plt.figure()\n",
    "plt.hist(train_df.cutwordnum,bins=100)\n",
    "plt.xlabel(\"词组长度\",size = 12)\n",
    "plt.ylabel(\"频数\",size = 12)\n",
    "plt.title(\"训练数据集\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Encode the string labels of every split as integer class ids.\n",
    "## The encoder is fitted on the labels of all three splits: fitting on the\n",
    "## training labels alone raises \"y contains previously unseen labels\" as soon as\n",
    "## the validation/test split holds a category the training split lacks.\n",
    "le = LabelEncoder()\n",
    "le.fit(pd.concat([train_df.label, val_df.label, test_df.label]))\n",
    "train_y = le.transform(train_df.label).reshape(-1,1)\n",
    "val_y = le.transform(val_df.label).reshape(-1,1)\n",
    "test_y = le.transform(test_df.label).reshape(-1,1)\n",
    "\n",
    "## One-hot encode the class ids.\n",
    "## The encoder is fitted on the full id range so that every matrix has one\n",
    "## column per known class, even if a class is absent from the training split.\n",
    "ohe = OneHotEncoder()\n",
    "ohe.fit(np.arange(len(le.classes_)).reshape(-1,1))\n",
    "train_y = ohe.transform(train_y).toarray()\n",
    "val_y = ohe.transform(val_y).toarray()\n",
    "test_y = ohe.transform(test_y).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.1 Jieba分词\n",
    "# import jieba\n",
    "# def chinese_word_cut(mytext):  \n",
    "#     return \" \".join(jieba.cut(mytext))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_df_cutword =train_df['content'].apply(chinese_word_cut)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('我们', 1)\n",
      "('中国', 2)\n",
      "('一个', 3)\n",
      "('基金', 4)\n",
      "('可以', 5)\n",
      "('市场', 6)\n",
      "('没有', 7)\n",
      "('银行', 8)\n",
      "('自己', 9)\n",
      "('他们', 10)\n",
      "===================\n",
      "('韩美', 167)\n",
      "('黄海', 168)\n",
      "('反潜', 100)\n",
      "('联合', 3885)\n",
      "('中新', 622)\n",
      "('中新网', 2338)\n",
      "('新网', 396)\n",
      "('10', 31441)\n",
      "('19', 5977)\n",
      "('日电', 4646)\n"
     ]
    }
   ],
   "source": [
    "## Encode the segmented words with the Keras Tokenizer.\n",
    "## fit_on_texts() splits each text on spaces and assigns every word an integer id;\n",
    "## ids follow word frequency: the more frequent the word, the smaller its id.\n",
    "max_words = 5000\n",
    "max_len = 600\n",
    "tok = Tokenizer(num_words=max_words)  ## cap on the vocabulary used when encoding\n",
    "tok.fit_on_texts(train_df.cutword)\n",
    "\n",
    "## word_index maps word -> id, word_counts maps word -> number of occurrences.\n",
    "## Show the first 10 entries of each mapping.\n",
    "for iterm in list(tok.word_index.items())[:10]:\n",
    "    print(iterm)\n",
    "print(\"===================\")  \n",
    "for iterm in list(tok.word_counts.items())[:10]:\n",
    "    print(iterm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(57000, 600)\n",
      "(5000, 600)\n",
      "(10000, 600)\n"
     ]
    }
   ],
   "source": [
    "## With the word ids fitted, represent every article as a sequence of ids ...\n",
    "train_seq, val_seq, test_seq = (\n",
    "    tok.texts_to_sequences(split.cutword) for split in (train_df, val_df, test_df)\n",
    ")\n",
    "## ... and bring every sequence to the same length (max_len).\n",
    "train_seq_mat, val_seq_mat, test_seq_mat = (\n",
    "    sequence.pad_sequences(seq, maxlen=max_len) for seq in (train_seq, val_seq, test_seq)\n",
    ")\n",
    "\n",
    "print(train_seq_mat.shape, val_seq_mat.shape, test_seq_mat.shape, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"functional_1\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "inputs (InputLayer)          [(None, 600)]             0         \n",
      "_________________________________________________________________\n",
      "embedding (Embedding)        (None, 600, 128)          640128    \n",
      "_________________________________________________________________\n",
      "lstm (LSTM)                  (None, 128)               131584    \n",
      "_________________________________________________________________\n",
      "FC1 (Dense)                  (None, 128)               16512     \n",
      "_________________________________________________________________\n",
      "dropout (Dropout)            (None, 128)               0         \n",
      "_________________________________________________________________\n",
      "FC2 (Dense)                  (None, 10)                1290      \n",
      "=================================================================\n",
      "Total params: 789,514\n",
      "Trainable params: 789,514\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "## Define the LSTM classifier\n",
    "inputs = Input(name='inputs', shape=(max_len,))\n",
    "## Embedding(input_dim = vocabulary size, output_dim = size of each word vector,\n",
    "##           input_length = number of tokens per article)\n",
    "layer = Embedding(input_dim=max_words+1, output_dim=128, input_length=max_len)(inputs)\n",
    "layer = LSTM(units=128)(layer)\n",
    "layer = Dense(units=128, activation=\"relu\", name=\"FC1\")(layer)\n",
    "layer = Dropout(rate=0.5)(layer)\n",
    "## 10 softmax outputs, one per news category\n",
    "layer = Dense(units=10, activation=\"softmax\", name=\"FC2\")(layer)\n",
    "model = Model(inputs=inputs, outputs=layer)\n",
    "model.summary()\n",
    "model.compile(loss=\"categorical_crossentropy\", optimizer=RMSprop(), metrics=[\"accuracy\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/3\n",
      "446/446 [==============================] - 775s 2s/step - loss: 0.6811 - accuracy: 0.7862 - val_loss: 0.4110 - val_accuracy: 0.8956\n",
      "Epoch 2/3\n",
      "349/446 [======================>.......] - ETA: 2:50 - loss: 0.2432 - accuracy: 0.9431"
     ]
    }
   ],
   "source": [
    "## Train the model\n",
    "## Stop training once val_loss no longer improves (by at least min_delta)\n",
    "early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001)\n",
    "model_fit = model.fit(\n",
    "    x=train_seq_mat,\n",
    "    y=train_y,\n",
    "    batch_size=128,\n",
    "    epochs=3,\n",
    "    validation_data=(val_seq_mat, val_y),\n",
    "    callbacks=[early_stop],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Predict class scores for the test set\n",
    "test_pre = model.predict(test_seq_mat)\n",
    "\n",
    "## Category names, listed in class-id order; used for the tick labels\n",
    "Labname = [\"体育\",\"娱乐\",\"家居\",\"房产\",\"教育\",\"时尚\",\"时政\",\"游戏\",\"科技\",\"财经\"]\n",
    "\n",
    "## Confusion matrix. sklearn's signature is confusion_matrix(y_true, y_pred):\n",
    "## rows index the true class and columns the predicted class, so the ground\n",
    "## truth must be passed first. `labels` pins the matrix to one row/column per\n",
    "## category so it always matches the tick labels.\n",
    "confm = metrics.confusion_matrix(np.argmax(test_y,axis=1),\n",
    "                                 np.argmax(test_pre,axis=1),\n",
    "                                 labels=np.arange(len(Labname)))\n",
    "## Plot the transpose so that x = true label and y = predicted label,\n",
    "## in agreement with the axis titles below\n",
    "plt.figure(figsize=(8,8))\n",
    "sns.heatmap(confm.T, square=True, annot=True,\n",
    "            fmt='d', cbar=False,linewidths=.8,\n",
    "            cmap=\"YlGnBu\")\n",
    "plt.xlabel('True label',size = 14)\n",
    "plt.ylabel('Predicted label',size = 14)\n",
    "plt.xticks(np.arange(10)+0.5,Labname,size = 12)\n",
    "## NOTE(review): the 0.3 offset is kept from the original; presumably a visual nudge for the y labels\n",
    "plt.yticks(np.arange(10)+0.3,Labname,size = 12)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## classification_report(y_true, y_pred): the ground truth goes first,\n",
    "## otherwise the reported precision and recall are exchanged\n",
    "print(metrics.classification_report(np.argmax(test_y,axis=1),np.argmax(test_pre,axis=1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Load test_data.csv and encode it with the tokenizer fitted earlier.\n",
    "## NOTE: this rebinds test_df / test_seq / test_seq_mat used by the evaluation cells.\n",
    "test_df = pd.read_csv('../cnews/test_data.csv')\n",
    "test_seq = tok.texts_to_sequences(test_df['cutword'])\n",
    "test_seq_mat = sequence.pad_sequences(sequences=test_seq, maxlen=max_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Inspect the padded id matrix built from test_data.csv\n",
    "test_seq_mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Run the model on the padded matrix and display the raw output\n",
    "## (presumably one row of 10 softmax scores per article, given the FC2 layer)\n",
    "predict = model.predict(test_seq_mat)\n",
    "predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Index of the largest score in the first prediction row\n",
    "np.argmax(predict[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Predicted class id of every row: the index of its largest score\n",
    "predictionn = np.argmax(predict, axis=1).tolist()\n",
    "prediction = pd.Series(predictionn)\n",
    "prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Attach the predicted class ids to the frame as the 'label' column\n",
    "test_df['label'] = prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[\"体育\",\"娱乐\",\"家居\",\"房产\",\"教育\",\"时尚\",\"时政\",\"游戏\",\"科技\",\"财经\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "财经、时政\t高风险              9,6\n",
    "房产、科技\t中风险              3,8\n",
    "教育、时尚、游戏\t低风险       4,5,7\n",
    "家居、体育、娱乐\t可公开       2,0,1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Class id -> (category name, risk level).\n",
    "## Ids use the same order as Labname (the category names in class-id order):\n",
    "## 6 is '时政' and 9 is '财经'.\n",
    "_FENLEI_TABLE = {\n",
    "    0: ('体育', '可公开'),\n",
    "    1: ('娱乐', '可公开'),\n",
    "    2: ('家居', '可公开'),\n",
    "    3: ('房产', '中风险'),\n",
    "    4: ('教育', '低风险'),\n",
    "    5: ('时尚', '低风险'),\n",
    "    6: ('时政', '高风险'),\n",
    "    7: ('游戏', '低风险'),\n",
    "    8: ('科技', '中风险'),\n",
    "    9: ('财经', '高风险'),\n",
    "}\n",
    "\n",
    "\n",
    "def fenlei(x):\n",
    "    \"\"\"Map a predicted class id to a ``(category name, risk level)`` tuple.\n",
    "\n",
    "    Args:\n",
    "        x: integer class id in the range 0..9.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(category, risk)`` of two strings.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``x`` is not a known class id. Failing here is clearer\n",
    "            than returning None and crashing later when the tuple is unpacked.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return _FENLEI_TABLE[x]\n",
    "    except (KeyError, TypeError):\n",
    "        raise ValueError(\"unknown class id: %r\" % (x,)) from None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Map every predicted class id to the tuple returned by fenlei\n",
    "fin = test_df['label'].apply(func=fenlei)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Split the (category, risk level) tuples into two parallel Series\n",
    "class_label_a = [pair[0] for pair in fin]\n",
    "rank_label_a = [pair[1] for pair in fin]\n",
    "class_label = pd.Series(class_label_a)\n",
    "rank_label = pd.Series(rank_label_a)\n",
    "rank_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Add the category name and the risk level as new columns\n",
    "test_df['class_label'] = class_label\n",
    "test_df['rank_label'] = rank_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Keep only the output columns: drop the text, its segmentation and the numeric label\n",
    "test_fin = test_df.drop(columns=['content', 'cutword', 'cutwordnum', 'label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Display the final table\n",
    "test_fin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Write the result to CSV (comma-separated, without the index column)\n",
    "test_fin.to_csv(\"../cnews/test_fin2.csv\", index=False, sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}